I. Introduction

II. Data Loading and Feature Engineering

2.1 Data Loading

## 1. occupancy data
occupancy = read.csv("trains_train.csv") 

## 2. station data
stations0 <- read.csv("stations.csv")
stations.sf <- 
  stations0 %>% 
  st_as_sf(coords = c("longitude", "latitude"), crs = 4326, agr = "constant") %>%
  st_transform(31370)

stations <- stations.sf %>%
  separate(URI, c("res", "station_name"), "NMBS/")%>%
  select(station_name,name,country.code,avg_stop_times,
         geometry)


### Check station distribution
belgium_basemap <- get_stamenmap(bbox = c(left = 2.739258, bottom = 49.563228,
                                          right = 6.551513, top = 51.695260), 
                                 zoom = 11)
ggmap(belgium_basemap)+
  geom_point(aes(x = longitude, y = latitude), data = stations0, size = 1, color ='darkred') +
  labs(title="Train Station Distribution in Belgium\n",
       caption = 'Figure 2.1',
       x="Longtitude", 
       y="Latitude")+
  plotTheme()

## 3. Weather data
W701 <- read.csv("weather_data_july_1.csv")
W702 <- read.csv("weather_data_july_2.csv")
W801 <- read.csv("weather_data_aug_1.csv")
W802 <- read.csv("weather_data_aug_2.csv")
W901 <- read.csv("weather_data_sep_1.csv")
W902 <- read.csv("weather_data_sep_2.csv")
W101 <- read.csv("weather_data_oct_1.csv")
W102 <- read.csv("weather_data_oct_2.csv")

weather <- rbind(W701,W702,W801,W802,W901,W902,W101,W102) %>%
  mutate(interval60 = ymd_hms(date_time))



## 4. Demographic data: population density
### population
### https://ec.europa.eu/eurostat/cache/metadata/en/demo_r_gind3_esms.htm
### topo data
### https://gisco-services.ec.europa.eu/distribution/v1/nuts-2016.html
bel <- st_read("NUTS_RG_60M_2016_4326_LEVL_3.shp") 
## Reading layer `NUTS_RG_60M_2016_4326_LEVL_3' from data source `/Users/penguin/Box Sync/GitHub/Final Project/NUTS_RG_60M_2016_4326_LEVL_3.shp' using driver `ESRI Shapefile'
## Simple feature collection with 1522 features and 9 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: -61.841 ymin: -21.376 xmax: 55.85 ymax: 71.178
## geographic CRS: WGS 84
popdens <- read.csv("demo_r_d2jan_1_Data.csv")
belpopden <- bel %>%
  left_join(dplyr::select(popdens, c(GEO, Value)), by = c("FID" = "GEO")) %>%
  st_transform(31370)



## 5. Facility data

facility <- read.csv("facilities.csv")
facility[is.na(facility)] <- 0 

facility <- 
  facility %>% 
  mutate(facility_sum = select(.,ticket_vending_machine:audio_induction_loop) %>% 
           rowSums()) %>% 
  select(name,facility_sum)

stations <- merge(stations, facility,by.x="name",all.x=TRUE) 

## 6. Railway data
### https://mapcruzin.com/free-belgium-arcgis-maps-shapefiles.htm
rail <- st_read("belgium-railways-shape/railways.shp") %>%
  st_transform(4326)
## Reading layer `railways' from data source `/Users/penguin/Box Sync/GitHub/Final Project/belgium-railways-shape/railways.shp' using driver `ESRI Shapefile'
## Simple feature collection with 3231 features and 3 fields
## geometry type:  LINESTRING
## dimension:      XY
## bbox:           xmin: 2.558077 ymin: 49.54788 xmax: 6.363977 ymax: 51.45972
## geographic CRS: WGS 84
## 7. Line Info
lines <- read.csv('line_info.csv')


# Initial Dataset

data <-
  occupancy %>%
  mutate(from = as.character(from), to = as.character(to)) %>%
  left_join(dplyr::select(stations, station_name), by = c("from" = "station_name")) %>%
  st_sf() %>% 
  mutate(from.X = st_coordinates(.)[,1], from.Y = st_coordinates(.)[,2]) %>%
  st_drop_geometry() %>%
  left_join(dplyr::select(stations, station_name), by = c("to" = "station_name")) %>%
  st_sf() %>% 
  mutate(to.X = st_coordinates(.)[,1], to.Y = st_coordinates(.)[,2]) %>%
  st_drop_geometry() %>%
  mutate(distance = sqrt((from.X - to.X)^2 + (from.Y - to.Y)^2)) %>%
  arrange(-distance) 


data <- data %>%
  mutate(datetime <- paste(date,time),
         datetime = parse_date_time(datetime, '%y-%m-%d %I:%M:%S %p'),
         interval60 = floor_date(ymd_hms(datetime), unit = "hour"),
         interval15 = floor_date(ymd_hms(datetime), unit = "15 mins"),
         week = week(interval60),
         dotw = wday(interval60)) %>%
  select(-`datetime <- paste(date, time)`)

data <- data %>%
  mutate(day = case_when(
     dotw==1~"Mon",
    dotw==2~"Tue",
    dotw==3~"Wed",
    dotw==4~"Thu",
    dotw==5~"Fri",
    dotw==6~"Sat",
    dotw==7~"Sun"
  )) %>%
  select(-dotw) 


data$vehicle_type <- gsub("[^a-zA-Z]", "", data$vehicle) 
data$vehicle_type[data$vehicle_type==""] <- "undefined"
data$vehicle_type[data$vehicle_type=="ic"] <- "ICE"

## final data: departure station data
data_f_0 <-  merge(data, stations,  by.x = "from", by.y = "station_name",all.x = TRUE) %>%
   st_as_sf()

data_f_00 <- merge(data_f_0,weather,by.x=c("name","interval60"),by.y=c("station_name","interval60"),all.x = TRUE)


## final data: destination station data
data_t_0 <-  merge(data, stations,  by.x = "to", by.y = "station_name",all.x = TRUE) %>%
   st_as_sf()

data_t_00 <- merge(data_t_0,weather,by.x=c("name","interval60"),by.y=c("station_name","interval60"),all.x = TRUE)

2.2 Feature Engineering

2.2.1 Station Density

Calculate the distance of nearest three stations around each station and show the outcome by occupancy level.

stations_copy <- stations

stations_copy$stationsdist <- nn_function(st_coordinates(stations), st_coordinates(stations), 3)

2.2.2 Distance of Stations to Major Cities

Codes below calculate the distances of each station to the five major cities: Brussels,Liege,Charleroi, Gent, and Antwerpen measured by population size.

name <- c("Brussels","Liege","Charleroi", "Gent", "Antwerpen")
lat <- c(50.85045,50.63373, 50.41136, 51.05, 51.21989)
lon <- c(4.34878, 5.56749, 4.44448, 3.71667, 4.40346)

majorcities <- tibble(name, lat, lon) %>%
  st_as_sf(coords = c("lon","lat"), crs = 4326, agr = "constant") %>%
  st_transform(31370)

stations_copy$majc_nn5 <- nn_function(st_coordinates(stations), st_coordinates(majorcities), 5)

stations_copy <-
  stations_copy %>%
  mutate(Brussels = sqrt((st_coordinates(.)[,1] - st_coordinates(majorcities)[1,1])^2 + (st_coordinates(.)[,2] - st_coordinates(majorcities)[1,2])^2)) %>%
  mutate(Liege = sqrt((st_coordinates(.)[,1] - st_coordinates(majorcities)[2,1])^2 + (st_coordinates(.)[,2] - st_coordinates(majorcities)[2,2])^2)) %>%
  mutate(Charleroi = sqrt((st_coordinates(.)[,1] - st_coordinates(majorcities)[3,1])^2 + (st_coordinates(.)[,2] - st_coordinates(majorcities)[3,2])^2)) %>%
  mutate(Gent = sqrt((st_coordinates(.)[,1] - st_coordinates(majorcities)[4,1])^2 + (st_coordinates(.)[,2] - st_coordinates(majorcities)[4,2])^2)) %>%
  mutate(Antwerpen = sqrt((st_coordinates(.)[,1] - st_coordinates(majorcities)[5,1])^2 + (st_coordinates(.)[,2] - st_coordinates(majorcities)[5,2])^2)) 

2.2.3 Occupancy Level Proportion of Each Station

Codes below are to calculate the proportion of each occupancy level for each origin station and destination station respectively.

## Origin Station
data_f_00_h <- data_f_00 %>%
  filter(occupancy=='high')

data_f_00_m <- data_f_00 %>%
  filter(occupancy=='medium')

data_f_00_l <- data_f_00 %>%
  filter(occupancy=='low')


sta_from_total <- as.data.frame(table(data_f_00$from)) %>% 
  filter(Var1!='(null)') %>% 
  mutate(station_f=as.character(Var1)) %>%
  mutate(F_total=Freq) %>%
  select(3,4)

sta_from_h <- as.data.frame(table(data_f_00_h$from)) %>% 
  filter(Var1!='(null)') %>% 
  mutate(station_f=as.character(Var1)) %>%
  mutate(F_H_freq=Freq) %>%
  select(3,4)

sta_from_m <- as.data.frame(table(data_f_00_m$from)) %>% 
  filter(Var1!='(null)') %>% 
  mutate(station_f=as.character(Var1)) %>%
  mutate(F_M_freq=Freq) %>%
  select(3,4) 

sta_from_l <- as.data.frame(table(data_f_00_l$from)) %>% 
  filter(Var1!='(null)') %>% 
  mutate(station_f=as.character(Var1)) %>%
  mutate(F_L_freq = Freq) %>%
  select(3,4)  


## Destination Station
data_t_00_h <- data_t_00 %>%
  filter(occupancy=='high')

data_t_00_m <- data_t_00 %>%
  filter(occupancy=='medium')

data_t_00_l <- data_t_00 %>%
  filter(occupancy=='low')


sta_to_total <- as.data.frame(table(data_t_00$to)) %>% 
  filter(Var1!='(null)') %>% 
  mutate(station_t = as.character(Var1)) %>%
  mutate(T_total = Freq) %>%
  select(3,4)

sta_to_h <- as.data.frame(table(data_t_00_h$to)) %>% 
  filter(Var1!='(null)') %>% 
  mutate(station_t = as.character(Var1)) %>%
  mutate(T_H_freq=Freq) %>%
  select(3,4)

sta_to_m <- as.data.frame(table(data_t_00_m$to)) %>% 
  filter(Var1!='(null)') %>% 
  mutate(station_t=as.character(Var1)) %>%
  mutate(T_M_freq=Freq) %>%
  select(3,4) 

sta_to_l <- as.data.frame(table(data_t_00_l$to)) %>% 
  filter(Var1!='(null)') %>% 
  mutate(station_t=as.character(Var1)) %>%
  mutate(T_L_freq = Freq) %>%
  select(3,4)  


stations_occ <- merge(stations,sta_from_total,by.x="station_name",by.y='station_f',all.x=T)
stations_occ <- merge(stations_occ,sta_from_h,by.x="station_name",by.y='station_f',all.x=T)
stations_occ <- merge(stations_occ,sta_from_m,by.x="station_name",by.y='station_f',all.x=T)
stations_occ <- merge(stations_occ,sta_from_l,by.x="station_name",by.y='station_f',all.x=T)

stations_occ <- merge(stations_occ,sta_to_total,by.x="station_name",by.y='station_t',all.x=T)
stations_occ <- merge(stations_occ,sta_to_h,by.x="station_name",by.y='station_t',all.x=T)
stations_occ <- merge(stations_occ,sta_to_m,by.x="station_name",by.y='station_t',all.x=T)
stations_occ <- merge(stations_occ,sta_to_l,by.x="station_name",by.y='station_t',all.x=T)

stations_occ$F_H_com <- stations_occ$F_H_freq/stations_occ$F_total
stations_occ$F_M_com <- stations_occ$F_M_freq/stations_occ$F_total
stations_occ$F_L_com <- stations_occ$F_L_freq/stations_occ$F_total
stations_occ$T_H_com <- stations_occ$T_H_freq/stations_occ$T_total
stations_occ$T_M_com <- stations_occ$T_M_freq/stations_occ$T_total
stations_occ$T_L_com <- stations_occ$T_L_freq/stations_occ$T_total
stations_occq <- select(stations_occ,1,14,15,16,17,18,19,20)

2.2.4 Average Occupancy Level Proportion by Station Group

Codes below calculate the average occupancy level proportion of the nearest three stations of each stations.

neighborList <- knn2nb(knearneigh(st_coordinates(stations), 3))
occupancy$from <- as.character(occupancy$from)
occupancy$to <- as.character(occupancy$to)

getnb_from <- function(x,i, from, occ){
    nbcount = occupancy %>%
      left_join(dplyr::select(stations_copy, c(station_name, stationsdist)), by = c(from = "station_name")) %>%
      filter(from %in% stations$station_name[neighborList[[i]]]) %>%
      tally()
    nbcount1 = occupancy %>%
      left_join(dplyr::select(stations_copy, c(station_name, stationsdist)), by = c(from = "station_name")) %>%
      filter(from %in% stations$station_name[neighborList[[i]]] & occupancy == occ) %>%
      tally()
    nbcount2 = nbcount1/nbcount
  return(nbcount2[,1])
}

#avg composition of low occupancy from neighbor from stations
nblist = list()
for (i in 1:length(neighborList)){
  nblist[[i]] = getnb_from(neighborList,i,"from","low")
}
stations_copy$nb_occ_f_low <- nblist

#avg composition of medium occupancy from neighbor from stations
nblist = list()
for (i in 1:length(neighborList)){
  nblist[[i]] = getnb_from(neighborList,i,"from","medium")
}
stations_copy$nb_occ_f_medium <- nblist

#avg composition of high occupancy from neighbor from stations
nblist = list()
for (i in 1:length(neighborList)){
  nblist[[i]] = getnb_from(neighborList,i,"from","high")
}
stations_copy$nb_occ_f_high <- nblist

#avg composition of low occupancy from neighbor to stations
nblist = list()
for (i in 1:length(neighborList)){
  nblist[[i]] = getnb_from(neighborList,i,"to","low")
}
stations_copy$nb_occ_t_low <- nblist


#avg composition of medium occupancy from neighbor to stations
nblist = list()
for (i in 1:length(neighborList)){
  nblist[[i]] = getnb_from(neighborList,i,"to","medium")
}
stations_copy$nb_occ_t_medium <- nblist


#avg composition of high occupancy from neighbor to stations
nblist = list()
for (i in 1:length(neighborList)){
  nblist[[i]] = getnb_from(neighborList,i,"to","high")
}
stations_copy$nb_occ_t_high <- nblist

2.2.5 Station Popularity

Then we explore the popularity of each station, in other words, we can calculate the number of lines connected to each origin station and destination station respectively.

connect <- function(line,stt){
  stt <- as.data.frame(stt) %>% 
    select(1)
  stt$connect <- NA

  for(i in seq(1:nrow(stt))){
    if(length(table(line$stops==stt[i,1]))==2){
      line_i <- line %>% 
        filter(stops==stt[i,1])
      stt$connect[i] <- nrow(line_i)
      }
    }
  return(stt)
}


lines_connect <- lines %>% 
  select(5,7) %>% 
  mutate(stops = gsub("\\[","",stopping_station_ids)) %>% 
  mutate(stops = gsub("\\]","",stops)) %>% 
  mutate(stops = gsub("'",  "",stops)) %>% 
  mutate(stops = gsub(" ",  "",stops)) %>% 
  unique() %>% 
  select(3) %>% 
  separate_rows(stops, sep=",") %>%
  group_by(stops) %>%
  summarize(connections = n())

stations_occ_line <- merge(stations_occq, lines_connect, 
                           by.x = "station_name", by.y = "stops", all.x = TRUE)

stations_final <- merge(st_drop_geometry(stations_copy),
                        st_drop_geometry(stations_occ_line),
                        by = "station_name") %>%
  select(-name,-country.code,-avg_stop_times,-facility_sum)

2.2.6 Occupancy Level Time Lag

Creating time lag variables will add additional nuance about the demand during a given time period. Since the period of given data barely covers holidays, thus this analysis creates lagHour, lag2Hours,lag3Hours, lag4Hours, and lag12Hours to show occupancy level of each shift before and after a certain time whth holiday effect ruled out.

III. Exploratory Data Analysis

3.1 Stations Characteristics

3.1.1 Station Density by Occupancy Level

ggplot() +
  geom_sf(data = rail, color = "grey") +  
  geom_sf(data = data_f %>%
            filter(country.code == 'be'), 
          aes(color = stationsdist) ,size = 0.5)+
  labs(title="Station Density by Occupancy Level",
       subtitle = 'Distance of each station to the nearest three stations\n',
       caption = 'Figure')+ 
  scale_colour_viridis(direction = -1,
  discrete = FALSE, option = "plasma",
  name = 'Counts')+
  facet_wrap(~occupancy, ncol = 3) + 
  mapTheme()+
  plotTheme()  +
  theme(legend.position = "bottom")

3.1.2 Distance of Stations to Major Cities

ggplot() +
  geom_sf(data = rail, color = "grey") +  
  geom_sf(data = data_f %>%
            filter(country.code == 'be'), 
          aes(color = majc_nn5) ,size = 0.5)+
  labs(title="Distance of Stations to Major Cities",
       subtitle = ' ',
       caption = 'Figure')+ 
  scale_colour_viridis(direction = -1,
  discrete = FALSE, option = "plasma",
  name = 'Counts')+
  facet_wrap(~occupancy, ncol = 3) + 
  mapTheme()+
  plotTheme()  

3.1.3 High Occupancy Level Proportion of Each Station

ggplot() +
  geom_sf(data = rail, color = "grey") +  
  geom_sf(data = data_f %>%
            filter(country.code == 'be'), 
          aes(color = F_H_com) ,size = 0.5)+
  labs(title="High Occupancy Level Proportion of Each Station",
       subtitle = ' ',
       caption = 'Figure')+ 
  scale_colour_viridis(direction = -1,
  discrete = FALSE, option = "plasma",
  name = 'Ratio')+
  mapTheme()+
  plotTheme()  

3.1.4 Station Popularity

ggplot() +
  geom_sf(data = rail, color = "grey") +  
  geom_sf(data = data_f %>%
            filter(country.code == 'be'), 
          aes(color = connections) ,size = 0.5)+
  labs(title="Station Popularity across Nation",
       subtitle = 'Counts of connections of each station\n',
       caption = 'Figure')+ 
  scale_colour_viridis(direction = -1,
  discrete = FALSE, option = "plasma",
  name = 'Counts')+
  mapTheme()+
  plotTheme() 

3.1.5 Facility Characteristics

ggplot() +
  geom_sf(data = rail, color = "grey") +  
  geom_sf(data = data_f %>%
            filter(country.code == 'be'), 
          aes(color = facility_sum) ,size = 0.5)+
  labs(title="Facility Counts across Stations",
       subtitle = 'Departure Station\n',
       caption = 'Figure')+ 
  scale_colour_viridis(direction = -1,
  discrete = FALSE, option = "plasma",
  name = 'Counts')+
  mapTheme()+
  plotTheme() 

3.2 Weather characteristics

grid.arrange(
  ggplot(data_f, aes(interval60,humidity)) + geom_line(color = "#6897BB") +
    labs(title="Weather characteristics: Percipitation",
         x="Hour", y="Perecipitation") + 
    plotTheme(),
  ggplot(data_f, aes(interval60,windspeed)) + geom_line(color = "#6897BB") +
    labs(title="Weather characteristics: Wind Speed", 
         x="Hour", y="Wind Speed") + 
    plotTheme(),
  ggplot(data_f, aes(interval60,temperature)) + geom_line(color = "#6897BB") +
    labs(title="Weather characteristics: Temperature",
         x="Hour", y="temperature",
         caption="Figure 3.1") + 
    plotTheme())

3. Serial Autocorrelation

ggplot(data_f %>% 
         mutate(hour = hour(datetime)))+
  geom_freqpoly(aes(hour, color = occupancy), binwidth = 1)+
  scale_colour_manual(values = palette3) +
  labs(title="Train Shift Counts by Occupancy Level (General)",
       subtitle = 'Belgium, 2016\n',
       caption = "Figure",
       
       x="Hour of the Day", 
       y="Counts")+
  plotTheme()

ggplot(data_f %>% mutate(hour = hour(datetime),
                             weekend = ifelse(day %in% c("Sun", "Sat"), "Weekend", "Weekday"))) +
  geom_freqpoly(aes(hour, color = weekend), binwidth = 1)+
  scale_color_manual(values = palette2,
                     name = 'Period') +
  labs(title="Train Shift Counts by Occupancy Level (Week Periods)",
       subtitle = 'Belgium, 2016\n',
       caption = 'Figure ',
       x="Hour of the Day", 
       y="Counts")+
  facet_wrap(~occupancy,ncol=3) +
  plotTheme()

ggplot(data_f %>% mutate(hour = hour(datetime))) +
  geom_freqpoly(aes(hour, color = day), binwidth = 1)+
  scale_color_manual(values = palette7,
                     name = 'Week Day') +
  labs(title="Train Shift Counts by Occupancy Level (Week Days)",
       subtitle = 'Belgium, 2016\n',
       caption = 'Figure ',
       x="Hour of the Day", 
       y="Counts")+
  facet_wrap(~occupancy,ncol=3) +
  plotTheme()

4. Spatial Autocorrelation

test04 <- data %>%
  group_by(vehicle) %>%
  summarise(n = n()) %>%
  filter(n >= 12)
freq_v = c("IC1518","IC429","IC1515","IC407","P7305","IC1807","IC3631","1828","8015","IC1831","IC539","P7444","S83978", "IC1507","IC716","L557","S23665","IC3432","IC4317","S53586")

p1 <- ggplot() +
  geom_sf(data = rail, aes(color = "grey")) + 
  geom_sf(data=subset(data_f, data_f$vehicle %in% freq_v),aes(colour = vehicle),size=2,show.legend = "point")+ 
  labs(title="Top 20 Busiest Lines",
       subtitle = 'Departure Stations\n',
       caption = "Figure")+
  mapTheme()+
  plotTheme() +
  theme(legend.position = "bottom") + 
  transition_manual(factor(vehicle, levels = freq_v), cumulative = TRUE) 
 
gganimate::animate(p1,  duration=15,renderer = gifski_renderer())

## occupancy by lines: destination
p2 <- ggplot()+
  geom_sf(data=rail,aes(color = "grey"))+
  geom_sf(data=subset(data_t, data_t$vehicle %in% freq_v),aes(colour = vehicle),size=2,show.legend = "point")+ 
  labs(title="Top 20 Frequent Lines Plotted to Destination\n",
       caption = 'Figure')+
  mapTheme()+
  plotTheme() +
  theme(legend.position = "bottom") + 
  transition_manual(factor(vehicle, levels = freq_v), cumulative = TRUE)

gganimate::animate(p2,  duration=15,renderer = gifski_renderer())

5. Space/Time Correlation

ggplot() +
  geom_sf(data = rail, color = "grey") +  
  geom_sf(data = data_f %>%
            filter(country.code == 'be'), 
          aes(color = occupancy) ,size = 0.5)+
  labs(title="Occupancy Level across Departure Station by Week Days\n",
       caption = 'Figure')+ 
  facet_wrap(~day,nrow =2) +
  scale_colour_manual(values = palette3,
                      name = 'Occupancy') +
  mapTheme()+
  plotTheme() 

ggplot() +
  geom_sf(data = rail, color = "grey") +  
  geom_sf(data = data_t %>%
            filter(country.code == 'be'), 
          aes(color = occupancy) ,size = 0.5)+
  labs(title="Occupancy Level across Destination Station by Week Days\n",
       caption = 'Figure')+ 
  facet_wrap(~day,nrow =2) +
  scale_colour_manual(values = palette3,
                      name = 'Occupancy') +
  mapTheme()+
  plotTheme() 

6. Weather Correlation

ggplot(data_f %>% 
         mutate(hour = hour(datetime)) %>% 
         filter(occupancy == "high" | occupancy == "medium" | occupancy == "low"), 
       aes(temperature, color = day))+
  geom_freqpoly(binwidth = 1)+
  facet_wrap(~occupancy,ncol=3) +
  scale_colour_manual(values = palette7,
                      name = "Week Day") +
  labs(title="Train Shift Counts across Temperature by Occupancy Level",
       subtitle = "Belgium, 2016\n",
       caption = "Figure ",
       x="Temperature", 
       y="Counts") +
  plotTheme()

ggplot(data_f %>% 
         mutate(hour = hour(datetime)) %>% 
         filter(occupancy == "high" | occupancy == "medium" | occupancy == "low"), 
       aes(humidity, color = day))+
  geom_freqpoly(binwidth = 1)+
  facet_wrap(~occupancy,ncol=3) +
  scale_colour_manual(values = palette7,
                      name = "Week Day") +
  labs(title="Train Shift Counts across Humidity by Occupancy Level",
       subtitle = "Belgium, 2016\n",
       caption = "Figure ",
       x="Humidity", 
       y="Counts") +
  plotTheme()

ggplot(data_f %>% 
         mutate(hour = hour(datetime)) %>% 
         filter(occupancy == "high" | occupancy == "medium" | occupancy == "low"), 
       aes(windspeed, color = day))+
  geom_freqpoly(binwidth = 1)+
  facet_wrap(~occupancy,ncol=3) +
  scale_colour_manual(values = palette7,
                      name = "Week Day") +
  labs(title="Train Shift Counts across Windspeed by Occupancy Level",
       subtitle = "Belgium, 2016\n",
       caption = "Figure ",
       x="Windspeed", 
       y="Counts") +
  plotTheme()

ggplot(data_f %>% 
         mutate(hour = hour(datetime)) %>% 
         filter(occupancy == "high" | occupancy == "medium" | occupancy == "low"), 
       aes(visibility, color = day))+
  geom_freqpoly(binwidth = 1)+
  facet_wrap(~occupancy,ncol=3) +
  scale_colour_manual(values = palette7,
                      name = "Week Day") +
  labs(title="Train Shift Counts across Visibility by Occupancy Level",
       subtitle = "Belgium, 2016\n",
       caption = "Figure ",
       x="Visibility", 
       y="Counts") +
  plotTheme()